In [1]:
    
import glob
import os
import random
import subprocess
import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import seaborn as sns
import statsmodels.stats.multitest as smm
import vcf as pyvcf
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'rare_variant_eqtls'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
    
In [2]:
    
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'lead_variants.tsv')
lead_vars = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls02', 'lead_variants.tsv')
lead_vars2 = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls03', 'lead_variants.tsv')
lead_vars3 = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'unrelated_eqtls01', 'lead_variants.tsv')
unr_lead_vars = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
subject_meta_eqtl = subject_meta.ix[rna_meta_eqtl.subject_id]
rna_meta_eqtl = rna_meta_eqtl.merge(subject_meta_eqtl, left_on='subject_id', right_index=True)
    
In [3]:
    
lead_vars = lead_vars[lead_vars.perm_sig]
lead_vars = lead_vars[lead_vars.variant_type == 'snv']
lead_vars2 = lead_vars2[lead_vars2.perm_sig]
lead_vars2 = lead_vars2[lead_vars2.variant_type == 'snv']
lead_vars3 = lead_vars3[lead_vars3.perm_sig]
lead_vars3 = lead_vars3[lead_vars3.variant_type == 'snv']
unr_lead_vars = unr_lead_vars[unr_lead_vars.perm_sig]
unr_lead_vars = unr_lead_vars[unr_lead_vars.variant_type == 'snv']
    
In [4]:
    
af_cols = [x for x in lead_vars.columns if '_AF' in x]
lead_vars_af = lead_vars.dropna(subset=af_cols)
lead_vars2_af = lead_vars2.dropna(subset=af_cols)
lead_vars3_af = lead_vars3.dropna(subset=af_cols)
    
In [5]:
    
lead_vars[af_cols] = lead_vars[af_cols].fillna(0)
    
In [6]:
    
lead_vars_af.AF.hist()
plt.ylabel('Number of variants')
plt.xlabel('Alternate allele frequency');
    
    
The allele frequencies from 1,000 Genomes are for the alternate allele, so I need to take care when I define which allele is minor.
In [7]:
    
a = sum((lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols))
b = sum((lead_vars_af[af_cols] > 1 - 0.005).sum(axis=1) == len(af_cols))
print('There are {} variants where the alternate is rare and {} '
      'where the reference is rare.'.format(a, b))
    
    
It doesn't seem like any of my eQTL lead variants that are in 1000 Genomes have rare reference alleles, so I can say that rare variants have the alternate allele as the minor allele.
In [8]:
    
lead_vars_af['rare'] = False
lead_vars_af.ix[lead_vars_af[(lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols)].index, 'rare'] = True
a = lead_vars_af.rare.sum()
b = len(set(lead_vars_af.ix[lead_vars_af.rare, 'gene_id']))
print('There are {} rare lead variants for {} eGenes.'.format(a, b))
    
    
In [9]:
    
se = lead_vars_af[lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')
se = lead_vars_af[lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('All samples')
plt.legend();
    
    
In [10]:
    
af_cols = [x for x in unr_lead_vars.columns if '_AF' in x]
unr_lead_vars_af = unr_lead_vars.dropna(subset=af_cols)
unr_lead_vars_af['rare'] = False
unr_lead_vars_af.ix[unr_lead_vars_af[(unr_lead_vars_af[af_cols] < 0.005).sum(axis=1) == 
                                     len(af_cols)].index, 'rare'] = True
    
In [11]:
    
se = unr_lead_vars_af[unr_lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')
se = unr_lead_vars_af[unr_lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('Unrelateds eQTL')
plt.legend();
    
    
In [12]:
    
def get_genotypes(df):
    genotypes = pd.DataFrame(np.nan, index=df.index, columns=rna_meta_eqtl.wgs_id)
    fn = '/projects/CARDIPS/analysis/cardips-ipsc-eqtl/private_output/eqtl_input/filtered_all/0000.vcf.gz'
    vcf_reader = pyvcf.VCFReader(open(fn))
    count = 0
    for i in df.index:
        if count % 1000 == 0:
            print(count)
        res = vcf_reader.fetch(df.ix[i, 'chrom'][3:], df.ix[i, 'end'],
                               df.ix[i, 'end'])
        r = res.next()
        hom_refs = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_refs()])
        hets = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hets()])
        hom_alts = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_alts()])
        genotypes.ix[i, hom_refs] = 0
        genotypes.ix[i, hets] = 1
        genotypes.ix[i, hom_alts] = 2
        count += 1
    return genotypes
    
In [13]:
    
out = os.path.join(private_outdir, 'genotypes.tsv')
if not os.path.exists(out):
    genotypes = get_genotypes(lead_vars_af)
    genotypes.to_csv(out, sep='\t')
else:
    genotypes = pd.read_table(out, index_col=0)
    
out = os.path.join(private_outdir, 'genotypes2.tsv')
if not os.path.exists(out):
    genotypes2 = get_genotypes(lead_vars2_af)
    genotypes2.to_csv(out, sep='\t')
else:
    genotypes2 = pd.read_table(out, index_col=0)
    
out = os.path.join(private_outdir, 'genotypes3.tsv')
if not os.path.exists(out):
    genotypes3 = get_genotypes(lead_vars3_af)
    genotypes3.to_csv(out, sep='\t')
else:
    genotypes3 = pd.read_table(out, index_col=0)
    
In [14]:
    
genotypes_f = genotypes.ix[lead_vars_af.index, rna_meta_eqtl.wgs_id]
    
In [15]:
    
lead_vars_af['ref_is_major'] = lead_vars_af.ac < lead_vars_af.ns
    
In [16]:
    
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');
    
    
In [17]:
    
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');
plt.xlim(0, 0.05)
    
    Out[17]:
    
In [18]:
    
lead_vars_uniq = lead_vars.drop_duplicates(subset='location')
lead_vars_uniq['minor_AF'] = lead_vars_uniq.AF
lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF'] = \
    1 - lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF']
lead_vars_uniq['ref_is_major'] = lead_vars_uniq.ac < lead_vars_uniq.ns
    
In [24]:
    
a = (lead_vars_uniq.AF == 0).value_counts()[False]
b = len(set(lead_vars_uniq.gene_id))
print('Obtained allele frequency for {:,} lead SNVs from {:,} eGenes from 1000 Genomes.\n'.format(a, b))
a = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].gene_id))
nr = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].location))
print('{:,} lead SNVs from {:,} eGenes were rare (MAF < 0.5%) in 1000 Genomes.'.format(nr, a))
n = (lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].maf > 0.05).value_counts()[True]
print('{:,} of these {:,} lead SNVs ({:.1f}%) are not rare among the 215 donors.'.format(n, nr, 100 * n / float(nr)))
a = (lead_vars_uniq.AF == 0).value_counts()[True]
print('{:,} of these {:,} lead SNVs were not seen in 1000 Genomes.'.format(a, nr))
a = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) & 
                           (lead_vars_uniq.minor_AF > 0)].gene_id))
b = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) &
                           (lead_vars_uniq.minor_AF > 0)].location))
print('{:,} of these {:,} lead SNVs from {:,} eGenes were observed in 1000 Genomes but were '
      'rare (MAF < 0.5%).\n'.format(b, nr, a))
    
    
In [25]:
    
jg = sns.jointplot(lead_vars_uniq.maf, lead_vars_uniq.minor_AF, alpha=0.2, stat_func=None);
    
    
In [48]:
    
lead_vars_uniq.to_csv(os.path.join(outdir, 'lead_vars_uniq.tsv'), sep='\t')
    
In [29]:
    
wgs_meta = subject_meta.copy(deep=True)
wgs_meta = wgs_meta.ix[rna_meta[rna_meta.in_eqtl].subject_id]
se = pd.Series(rna_meta[rna_meta.in_eqtl].wgs_id.values, index=rna_meta[rna_meta.in_eqtl].subject_id)
wgs_meta.index = se[list(wgs_meta.index)].values
    
In [30]:
    
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_uniq.ac.min()]
    
In [31]:
    
for i in family_vc.index:
    se = genotypes.ix[lead_vars_uniq.index, 
                      rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
    c = '{}_maf'.format(i)
    lead_vars_uniq[c] = np.nan
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values] / (family_vc[i] * 2)
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values]) / (family_vc[i] * 2)
    c = '{}_mac'.format(i)
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values]
    lead_vars_uniq.ix[(lead_vars_uniq.ref_is_major == False).values, c] = \
        (family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values])
    
In [38]:
    
t = lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005]
num_fams = []
for i in t.index:
    se = genotypes.ix[i]
    mode = se.mode().values[0]
    num_fams.append(len(set(wgs_meta.ix[se[se != mode].index, 'family_id'])))
num_fams = pd.Series(num_fams)
a = sum(num_fams >= 2)
print('{:,} of {:,} rare lead variants were observed in more than one family.'.format(a, t.shape[0]))
    
    
In [47]:
    
ax = num_fams.value_counts().sort_index().plot.bar()
ax.set_ylabel('Number of lead SNVs')
ax.set_xlabel('Number of families observed in');
    
    
In [49]:
    
num_fams.to_csv(os.path.join(outdir, 'rare_num_fams.tsv'), sep='\t')
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_af.ac.min()]
    
In [21]:
    
for i in family_vc.index:
    se = genotypes_f[rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
    c = '{}_maf'.format(i)
    lead_vars_af[c] = np.nan
    lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major] / (family_vc[i] * 2)
    lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False]) / (family_vc[i] * 2)
    c = '{}_mac'.format(i)
    lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major]
    lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False])
    
In [22]:
    
c = '{}_maf'.format(family_vc.index[1])
t = lead_vars_af[lead_vars_af[c] > lead_vars_af.maf]
t = t[t.rare]
t = t[t.maf < 0.05]
    
In [23]:
    
plt.scatter(t.maf, t[c])
    
    Out[23]:
    
In [24]:
    
for i in family_vc.index:
    c = '{}_mac'.format(i)
    a = set(subject_meta.ix[subject_meta.family_id == i, 'ethnicity_group'])
    print(i, len(set(lead_vars_af.ix[lead_vars_af.ac == lead_vars_af[c], 'gene_id'])), list(a))
    
    
In [21]:
    
subject_meta[subject_meta.family_id == '84fda65d-9a06-4bbe-ad75-a24773724c32'].head()
    
    Out[21]:
In [21]:
    
lead_vars_af.to_csv(os.path.join(outdir, 'lead_vars_af.tsv'), sep='\t')
unr_lead_vars_af.to_csv(os.path.join(outdir, 'unr_lead_vars_af.tsv'), sep='\t')
    
In [28]:
    
tdf = lead_vars_af[lead_vars_af.rare]
    
In [31]:
    
plt.scatter(tdf.AF, tdf.beta)
    
    Out[31]:
    
In [46]:
    
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['84fda65d-9a06-4bbe-ad75-a24773724c32_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)
    
    Out[46]:
In [47]:
    
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['abb401f1-5c3e-48ed-8c55-839ce2afe7e6_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)
    
    Out[47]:
TODO: Are any of the lead variants for the second or third eQTLs rare?
In [50]:
    
sum(lead_vars2.AF < 0.005)
    
    Out[50]:
In [51]:
    
sum(lead_vars3.AF < 0.005)
    
    Out[51]:
In [ ]: